{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split,cross_val_score,KFold\n",
    "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "train_df=pd.read_csv('./titanic/train.csv')\n",
    "test_df=pd.read_csv('./titanic/test.csv')\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "#重复上一节的操作...\n",
    "train_df['Cabin'].fillna('missing',inplace=True)\n",
    "test_df['Cabin'].fillna('missing',inplace=True)\n",
    "train_df['Embarked'].fillna(train_df['Embarked'].mode()[0],inplace=True)\n",
    "train_df['Age'].fillna(train_df['Age'].mean(),inplace=True)\n",
    "test_df['Age'].fillna(train_df['Age'].mean(),inplace=True)\n",
    "test_df['Fare'].fillna(train_df['Fare'].mean(),inplace=True)\n",
    "import category_encoders as ce\n",
    "del train_df['Name']\n",
    "del train_df['Ticket']\n",
    "del test_df['Name']\n",
    "del test_df['Ticket']\n",
    "del train_df['PassengerId']\n",
    "del test_df['PassengerId']\n",
    "label=train_df[\"Survived\"]\n",
    "del train_df[\"Survived\"]\n",
    "# target \n",
    "target_encoder = ce.TargetEncoder(cols=['Embarked','Cabin']).fit(train_df,label)\n",
    "train_df=target_encoder.transform(train_df)\n",
    "test_df=target_encoder.transform(test_df)\n",
    "\n",
    "# one hot\n",
    "onehot_encoder = ce.OneHotEncoder(cols=['Sex']).fit(train_df)\n",
    "train_df=onehot_encoder.transform(train_df)\n",
    "test_df=onehot_encoder.transform(test_df)\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer\n",
    "#z-score归一化为例\n",
    "standard_scaler=StandardScaler()\n",
    "standard_scaler.fit(train_df)\n",
    "new_train_df=pd.DataFrame(standard_scaler.transform(train_df),columns=train_df.columns)\n",
    "new_test_df=pd.DataFrame(standard_scaler.transform(test_df),columns=train_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "poly=PolynomialFeatures(degree=2,include_bias=False,interaction_only=False)#\n",
    "poly_fea_np=poly.fit_transform(train_df)\n",
    "poly_fea_df=pd.DataFrame(poly_fea_np,columns=poly.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "poly_fea_test_np=poly.transform(test_df)\n",
    "poly_fea_test_df=pd.DataFrame(poly_fea_test_np,columns=poly.get_feature_names())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "主要考虑提供更多数据给模型训练，包括两方面：  \n",
    "\n",
    "（1）利用其余的未标记数据进行无监督学习，在我们的标记数据进行监督学习（半监督学习），比如nlp任务中收集海量的文本数据训练embedding，然后再在其他nlp任务上做fine tuning；  \n",
    "\n",
    "（2）在当前数据的基础上造出相似的数据，比如nlp任务中删除某一个词、替换同义词...，cv任务中缩放、旋转、翻转图片、gan..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 一.半监督学习\n",
    "这里没有多余的feature数据，我们假设test部分就是多出来的部分；   \n",
    "在pca上做对比..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7653568059407474, 0.02975655731469058)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#增强前\n",
    "from sklearn.decomposition import PCA\n",
    "X_pca=PCA(n_components=20).fit_transform(poly_fea_df)\n",
    "#gbdt\n",
    "classifier=GradientBoostingClassifier()\n",
    "scores = cross_val_score(classifier,X_pca, label, scoring='f1', cv = 5)\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7748675479764392, 0.023371128453025514)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#增强后\n",
    "X_pca=PCA(n_components=20).fit_transform(np.concatenate([poly_fea_df,poly_fea_test_df]))\n",
    "#gbdt\n",
    "classifier=GradientBoostingClassifier()\n",
    "scores = cross_val_score(classifier,X_pca[:891], label, scoring='f1', cv = 5)\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其他ae,kmeans都可以尝试..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 二.过采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.7764333124477861, 0.061403220779861586)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "kfold= KFold(n_splits=5,random_state=42,shuffle=True)\n",
    "scores=[]\n",
    "for train_index,test_index in kfold.split(poly_fea_df,label):\n",
    "    X_train=poly_fea_df.loc[train_index]\n",
    "    y_train=label[train_index]\n",
    "    X_test=poly_fea_df.loc[test_index]\n",
    "    y_test=label[test_index]\n",
    "    \n",
    "    X_resampled,y_resampled=SMOTE(k_neighbors=5).fit_sample(X_train,y_train)\n",
    "    \n",
    "    gbdt=GradientBoostingClassifier()\n",
    "    gbdt.fit(X_resampled,y_resampled)\n",
    "    y_predict=gbdt.predict(X_test)\n",
    "    f1_score=metrics.f1_score(y_test,y_predict)\n",
    "    scores.append(f1_score)\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 三.自定义规则\n",
    "对每条训练数据做如下操作：  \n",
    "（1）随机删掉某个特征（0替换）；  \n",
    "（2）随机交换同class的某个特征的值；  \n",
    "（3）随机交换非class的某个特征的值； "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import random\n",
    "def extend_data(train_df,train_y):\n",
    "    #删除操作\n",
    "    rows,cols=train_df.shape\n",
    "    delete_df=copy.deepcopy(train_df)\n",
    "    for i in range(0,rows):\n",
    "        j=random.choice(range(0,cols))\n",
    "        delete_df.iloc[i,j]=0#注意：要用iloc[i,j]的方式才能成功赋值，loc[i,j],iloc[i][j],iloc[i,j]的方式都不行\n",
    "    #替换操作\n",
    "    replace_df=copy.deepcopy(train_df)\n",
    "    zero_class_df=train_df[train_y==0]\n",
    "    one_class_df=train_df[train_y==1]\n",
    "    zero_rows,_=zero_class_df.shape\n",
    "    one_rows,_=one_class_df.shape\n",
    "    for i in range(0,rows):\n",
    "        j=random.choice(range(0,cols))\n",
    "        if train_y.tolist()[i]==0:\n",
    "            new_i=random.choice(range(0,zero_rows))\n",
    "            replace_df.iloc[i,j]=zero_class_df.iloc[new_i,j]\n",
    "        else:\n",
    "            new_i=random.choice(range(0,one_rows))\n",
    "            replace_df.iloc[i,j]=one_class_df.iloc[new_i,j]\n",
    "    #替换操作\n",
    "    replace_df2=copy.deepcopy(train_df)\n",
    "    for i in range(0,rows):\n",
    "        j=random.choice(range(0,cols))\n",
    "        if train_y.tolist()[i]==0:\n",
    "            new_i=random.choice(range(0,one_rows))\n",
    "            replace_df2.iloc[i,j]=one_class_df.iloc[new_i,j]\n",
    "        else:\n",
    "            new_i=random.choice(range(0,zero_rows))\n",
    "            replace_df2.iloc[i,j]=zero_class_df.iloc[new_i,j]\n",
    "    #合并\n",
    "    return pd.concat([train_df,delete_df,replace_df,replace_df2]),train_y.tolist()*4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.772241111484664, 0.052028637093507364)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kfold= KFold(n_splits=5,random_state=42,shuffle=True)\n",
    "scores=[]\n",
    "for train_index,test_index in kfold.split(poly_fea_df,label):\n",
    "    X_train=poly_fea_df.loc[train_index]\n",
    "    y_train=label[train_index]\n",
    "    X_test=poly_fea_df.loc[test_index]\n",
    "    y_test=label[test_index]\n",
    "    \n",
    "    X_extended,y_extended=extend_data(X_train,y_train)\n",
    "    X_extended2,y_extended2=extend_data(X_train,y_train)\n",
    "    X_extended3,y_extended3=extend_data(X_train,y_train)\n",
    "    \n",
    "    gbdt=GradientBoostingClassifier()\n",
    "    gbdt.fit(pd.concat([X_train,X_extended,X_extended2,X_extended3]),y_train.tolist()+y_extended+y_extended2+y_extended3)\n",
    "    y_predict=gbdt.predict(X_test)\n",
    "    f1_score=metrics.f1_score(y_test,y_predict)\n",
    "    scores.append(f1_score)\n",
    "np.mean(scores),np.std(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
